// -*- Mode: c -*-
//
/*
 * A module that allows the reading of performance counters on Pentium.
 *
 * This file contains both code that uses the performance counters to
 * compute the number of cycles per microsecond (to be used during
 * ./configure) and also code to read the performance counters from Ocaml.
 *
 * Author: George Necula (necula@cs.berkeley.edu)
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Store here the first value read from the performance counter.  It stays 0
 * until the first call to read_ppc(), which latches the raw counter here. */
static unsigned long long first_value;

/* This is the function that actually reads the performance counter.
 *
 * It executes RDTSC and returns the number of cycles elapsed since the first
 * call to this function; the first call records the raw counter value as the
 * baseline and therefore returns 0.
 *
 * RDTSC is available on both 32-bit x86 and x86-64.  On any other target the
 * function prints a message and terminates the process with exit(1). */
static inline unsigned long long read_ppc(void) {
#if defined(__i386__) || defined(__x86_64__)
  /* RDTSC delivers the counter as two 32-bit halves, so 32-bit outputs are
   * the exact width on both x86 flavours. */
  unsigned int low, high;
  unsigned long long lowhigh;

  // RDTSC puts the result in EAX and EDX. We tell gcc to use those registers
  // for "low" and "high"
  __asm__ volatile ("rdtsc" : "=a" (low), "=d" (high));

  // Put the 64-bit value together
  lowhigh = ((unsigned long long)high << 32) | (unsigned long long)low;

  // The first reading becomes the baseline all later readings are relative to
  if(first_value == 0) {
    first_value = lowhigh;
  }
  return lowhigh - first_value;
#else
  printf ("Reading of performance counters is supported only on Intel x86\n");
  exit(1);
  return 0; /* not reached; keeps the non-void function well-formed */
#endif
}


/* sm: I want a version that is as fast as possible, dropping
 * bits that aren't very important to achieve it.
 *
 * This version drops the low 20 bits and the high 14 bits of the 64-bit
 * counter so the result is 30 bits (always a positive Ocaml int); this
 * yields megacycles, which for GHz machines will be something like
 * milliseconds.
 *
 * The value is the raw counter (not relative to any baseline), so it wraps
 * around whenever the retained 30 bits overflow.
 *
 * RDTSC is available on both 32-bit x86 and x86-64.  On any other target the
 * function prints a message and terminates the process with exit(1). */
static inline unsigned long sample_ppc_20(void)
{
#if defined(__i386__) || defined(__x86_64__)
  /* EAX/EDX are 32 bits wide; anything shifted past bit 31 is discarded by
   * the 30-bit mask anyway. */
  unsigned int low, high;
  __asm__ volatile ("rdtsc" : "=a" (low), "=d" (high));

  /* counter bits 20..31 come from `low`, bits 32..49 from `high` */
  return ((low >> 20) | (high << 12)) & 0x3FFFFFFF;
#else
  printf("Reading performance counters is supported only on x86");
  exit(1);
  return 0; /* not reached; keeps the non-void function well-formed */
#endif
}

/* This version drops the low 10 bits, yielding something like
 * microseconds.  Only 30 bits are kept (counter bits 10..39), so the result
 * is always a positive Ocaml int and wraps around when those bits overflow.
 *
 * RDTSC is available on both 32-bit x86 and x86-64.  On any other target the
 * function prints a message and terminates the process with exit(1). */
static inline unsigned long sample_ppc_10(void)
{
#if defined(__i386__) || defined(__x86_64__)
  /* EAX/EDX are 32 bits wide; anything shifted past bit 31 is discarded by
   * the 30-bit mask anyway. */
  unsigned int low, high;
  __asm__ volatile ("rdtsc" : "=a" (low), "=d" (high));

  /* counter bits 10..31 come from `low`, bits 32..39 from `high` */
  return ((low >> 10) | (high << 22)) & 0x3FFFFFFF;
#else
  printf("Reading performance counters is supported only on x86");
  exit(1);
  return 0; /* not reached; keeps the non-void function well-formed */
#endif
}



#ifndef CONFIGURATION_ONLY
/*** This is the OCAML stub for the read_ppc ***/
#include <caml/mlvalues.h>
#include <caml/alloc.h>
#include <caml/memory.h>

#define CYCLES_PER_USEC @CYCLES_PER_USEC@
/* Ocaml entry point for read_ppc(): converts the cycle count it returns
 * (cycles since the first reading) into seconds, using the CYCLES_PER_USEC
 * value substituted into this file, and hands the result to copy_double. */
value read_pentium_perfcount()
{
  const unsigned long long elapsed_cycles = read_ppc();

  /* cycles / (1e6 usec-per-sec * cycles-per-usec) == seconds */
  return copy_double((double)elapsed_cycles / (1000000.0 * CYCLES_PER_USEC));
}

/* The Ocaml system can use this function to figure out if there are
 * performance counters available */
value has_performance_counters() {
  // HAS_PERFCOUNT is set by the configuration code at the end of
  // this file, during ./configure
#if @HAS_PERFCOUNT@ != 0
  return Val_true;
#else
  return Val_false;
#endif
}

/* sm: Ocaml-callable wrapper around sample_ppc_20(); the sample is
 * returned as an Ocaml integer via Val_long. */
value sample_pentium_perfcount_20()
{
  const unsigned long sample = sample_ppc_20();
  return Val_long(sample);
}

/* Ocaml-callable wrapper around sample_ppc_10(); the sample is returned as
 * an Ocaml integer via Val_long. */
value sample_pentium_perfcount_10()
{
  const unsigned long sample = sample_ppc_10();
  return Val_long(sample);
}

#endif


/* Now we have a function that tries to compute the number of cycles per
 * microsecond (to be used during ./configure) */
#ifdef CONFIGURATION_ONLY
#include <sys/times.h>
#include <unistd.h>
#include <math.h>

/* Calibration program: measures how many counter cycles elapse per
 * microsecond of user CPU time and prints that number on stdout.
 *
 * It samples both times() (user time, in clock ticks) and read_ppc() around
 * a fixed busy loop, then scales the cycle/tick ratio by the number of clock
 * ticks per second reported by sysconf(_SC_CLK_TCK).
 *
 * Exits with status 1 (after printing a diagnostic) when the tick rate is
 * unavailable, when no user time was observed, or when the cycle counter
 * did not advance; exits with status 0 otherwise. */
int main(void) {
  struct tms t;
  clock_t start, finish, diff;
  unsigned long long start_pc, finish_pc, diff_pc;
  long clk_per_sec = sysconf(_SC_CLK_TCK);
  double cycles_per_usec;

  if(clk_per_sec <= 0) {
    printf("Cannot find clk_per_sec (got %ld)\n", clk_per_sec);
    exit(1);
  }

  times(&t); start = t.tms_utime;
  start_pc = read_ppc();
  // Do something for a while
  {
    int i;
    // volatile: the result is never used, so without it an optimizing
    // compiler is free to delete the whole loop and leave nothing to time
    volatile double a = 5.678;
    for(i=0;i<10000000;i++) {
      a = (i & 1) ? (a * a) : (sqrt(a));
    }
  }
  times(&t); finish = t.tms_utime;
  finish_pc = read_ppc();
  diff = finish - start;
  diff_pc = finish_pc - start_pc;
  if(diff == 0) {
    printf("Cannot use Unix.times\n");
    exit(1);
  }
  if(diff_pc == 0) {
    printf("Invalid result from the peformance counters\n");
    exit(1);
  }
//  printf("diff = %ld, diff_pc = %ld, clk = %ld\n",
//         (long)diff,
//         (long)diff_pc, (long)clk_per_sec);

  // We care about cycles per microsecond.  The scaling by 1e6 is done in
  // floating point so that no fraction of a megacycle is truncated away.
  cycles_per_usec = ((((double)diff_pc / 1000000.0) / (double)diff)
                               * (double)clk_per_sec);

  /* Whatever value we print here will be used as the CYCLES_PER_USEC
   * below */
  printf("%.3lf\n", cycles_per_usec);
  exit(0);
}
#endif //defined CONFIGURATION_ONLY

